# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset from disk
df = pd.read_csv("lfw_arnie_nonarnie.csv")
df.head()

# Separate the class label from the predictor columns
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows as a test set. The split is stratified on the
# label and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

# Cross-validation scheme: 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers, keyed by the name that is also used as the
# pipeline step name and as the prefix of the grid parameter names below
models = {
    "LogisticRegression": LogisticRegression(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}

# Hyperparameter grids, one per model; every parameter is addressed as
# "<step name>__<parameter>"
param_grid = {
    "LogisticRegression": {
        "LogisticRegression__C": [0.01, 0.1, 1, 10],
    },
    "KNeighborsClassifier": {
        "KNeighborsClassifier__n_neighbors": range(1, 10),
    },
    "DecisionTreeClassifier": {
        "DecisionTreeClassifier__max_depth": [2, 5, 10],
        "DecisionTreeClassifier__min_samples_split": [2, 5, 10, 20],
        "DecisionTreeClassifier__random_state": [42],
    },
}



# Result containers, each keyed by model name:
# best CV score, best parameter set, and the fitted grid-search object
pipe_accuracies, pipe_params, pipelines = {}, {}, {}

# For every candidate model, build a scale -> classify pipeline and tune it
# with an accuracy-scored grid search over that model's parameter grid
for name, model in models.items():
    pipeline = Pipeline([("scaler", StandardScaler()), (name, model)])

    # Build the search and fit it on the training split in one step
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid[name],
        scoring="accuracy",
        cv=kf,
    ).fit(X_train, y_train)

    # Record the outcome of the search for this model
    pipelines[name] = grid_search
    pipe_params[name] = grid_search.best_params_
    pipe_accuracies[name] = grid_search.best_score_

# Select the best model based on the best cross-validation score.
# Iterating a dict yields its keys, so a bare max(pipe_accuracies) would pick
# the alphabetically last model name; key= ranks the names by their CV score.
best_model_name = max(pipe_accuracies, key=pipe_accuracies.get)
# Read the score and parameters from the selected entry so that the name,
# score and parameters always describe the same model.
best_model_cv_score = pipe_accuracies[best_model_name]
best_model_info = pipe_params[best_model_name]

# Print the best model name, parameters, and CV score
print(f"Best Model: {best_model_name}")
print(f"Best Model Parameters: {best_model_info}")
print(f"Best Model CV Score: {best_model_cv_score}")

Best Model: LogisticRegression
Best Model Parameters: {'LogisticRegression__C': 1}
Best Model CV Score: 0.8288172043010752

# Predict on the held-out test split with the fitted search object stored
# for the selected model
y_pred = pipelines[best_model_name].predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report each metric on its own line, rounded to four decimals
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Accuracy: 0.8158
Precision: 1.0000
Recall: 0.1250
F1 Score: 0.2222

Column Name	Description
`PC1`, `PC2`, ..., `PCN`	These are principal components (from PCA), which represent important features of the face images.
`Label`	`1` = Arnold Schwarzenegger `0` = Someone else

	0	1	2	3	4	5	6	7	8	9	...	141	142	143	144	145	146	147	148	149	Label
0	-2.061987	0.581320	-0.249115	-0.631340	-1.359899	0.751619	-0.029364	1.610456	0.341402	1.286709	...	-0.356844	-0.016488	-0.228473	0.258134	0.046834	0.135742	-0.068297	0.022038	0.090003	1
1	-0.796838	-0.667228	-0.107889	0.019755	-0.686348	0.912779	0.463412	-0.238308	-0.294023	0.215470	...	-0.037243	-0.012105	-0.351285	-0.034968	0.192314	-0.015406	-0.089117	0.023588	-0.019998	1
2	5.376779	1.142695	2.543111	-2.727212	0.272785	-0.972187	1.111221	1.645502	-2.556968	-0.648781	...	0.157441	-0.333875	-0.303720	-0.085975	0.171346	0.128577	-0.118262	0.045881	-0.190158	1
3	7.029235	1.242883	-2.628079	1.224479	-1.141370	-1.620647	0.205890	1.567561	0.736200	0.010782	...	0.051040	-0.068796	0.141841	-0.227999	0.046044	0.013643	-0.125893	0.146396	0.013320	1
4	5.484822	6.752706	-4.291114	1.740412	-1.603087	-1.075175	1.919936	-0.197615	1.030596	1.451936	...	0.034412	0.265141	0.226000	0.032064	-0.113654	0.059126	-0.216803	0.025849	0.020456	1

Project Description

Goal

The Dataset

It contains:

Columns:

What We’ll Do

Final Outcome

The best-performing model based on cross-validation scores is LogisticRegression, with a cross-validation accuracy of 0.8288 and a test-set accuracy of 0.8158.